install.packages("Rtsne", dependencies = TRUE)
install.packages("uwot", dependencies = TRUE)
install.packages("ggdendro", dependencies = TRUE)
install.packages("ggrepel", dependencies = TRUE)
require(tidyverse)
require(magrittr)
require(xgboost)
require(Rtsne)
require(uwot)
library(ggdendro)
require(ggrepel)
# The MNIST test set in CSV format was downloaded from:
# https://github.com/pjreddie/mnist-csv-png
# Seed R's random number generator before any stochastic step runs.
set.seed(1)

# Attach with library() rather than require(): a missing package then stops
# the script right here instead of returning FALSE and failing further down.
library(tidyverse)
library(Rtsne)

# load() restores the objects stored in the .rda file into the workspace; the
# statements below rely on it providing `mnist.sample`.
load("./input/mnist_sample.rda")

# The first column is used as the label; every other column goes into the
# feature matrix.
train.label <- mnist.sample[, 1]
train.matrix <- mnist.sample[, -1] %>% as.matrix
n <- NROW(train.matrix)

# Print only the top-level structure of the feature matrix.
train.matrix %>% str(0)
#> int [1:10000, 1:784] 0 0 0 0 0 0 0 0 0 0 ...
#> - attr(*, "dimnames")=List of 2
# According to the uwot::umap documentation: https://rdrr.io/cran/uwot/man/umap.html
# Scatter-plot a two-dimensional embedding and, optionally, mark the centroid
# of every label group.
#
# Args:
#   .umap:   matrix-like object; columns 1 and 2 are used as the x and y
#            coordinates of the points.
#   label:   optional vector with one entry per row of `.umap`. When it is not
#            NULL, it is stored as a factor column `label` in the returned
#            mapping and each group's mean position is annotated on the plot.
#   .colour: optional vector mapped to the point colour (defaults to `label`).
#   title:   plot title.
#
# Returns (invisibly) a list with
#   plot:    the ggplot object,
#   mapping: a data.frame with columns id, dim1, dim2 (plus label when given).
plot.umap <- function(.umap, label = NULL, .colour = label, title = "") {
  mapping.umap <- data.frame(
    # seq_len() stays empty for a zero-row input, where 1:0 would yield c(1, 0).
    id = seq_len(NROW(.umap)),
    dim1 = .umap[, 1],
    dim2 = .umap[, 2]
  )

  ggp.umap <- ggplot2::ggplot(
    mapping.umap,
    ggplot2::aes(x = dim1, y = dim2, colour = .colour)
  ) +
    ggplot2::geom_point(alpha = 0.3, size = 0.2) +
    ggplot2::theme_bw() +
    # "none" replaces the deprecated `colour = FALSE` spelling.
    ggplot2::guides(colour = "none") +
    ggplot2::labs(title = title)

  if (!is.null(label)) {
    if (length(label) != NROW(.umap)) {
      stop("`label` must have one entry per row of `.umap`", call. = FALSE)
    }

    # Use the `label` argument itself, not a variable from the calling
    # environment.
    mapping.umap$label <- as.factor(label)

    # One row per label: the mean position of that group's points.
    labels.cent <- dplyr::summarise(
      dplyr::group_by(mapping.umap, label),
      dim1 = mean(dim1),
      dim2 = mean(dim2)
    )

    # inherit.aes = FALSE: the point layer's colour mapping is a per-point
    # vector and does not match the one-row-per-label centroid data, so this
    # layer declares its own aesthetics.
    ggp.umap <- ggp.umap +
      ggrepel::geom_label_repel(
        data = labels.cent,
        mapping = ggplot2::aes(x = dim1, y = dim2, label = label),
        inherit.aes = FALSE,
        label.size = 0.1
      )
  }

  invisible(
    list(
      plot = ggp.umap,
      mapping = mapping.umap
    )
  )
}
# Parameter grid: every combination of min_dist and n_neighbors, one row per
# combination (min_dist varies fastest).
pars <- expand.grid(
  min_dist = c(0.0125, 0.05, 0.2, 0.8),
  n_neighbors = c(5, 20, 80, 320)
)

# One slot per grid row; each slot receives the plot for that combination.
res <- vector("list", NROW(pars))

for (i in seq_len(NROW(pars))) {
  # Title of the form "<min_dist> : <n_neighbors>", also echoed as progress.
  title.txt <- sprintf(
    "%s : %s",
    as.character(pars$min_dist[i]),
    as.character(pars$n_neighbors[i])
  )
  cat(i, ": ", title.txt, "\n")

  # Embed the feature matrix with the current parameter pair.
  res.umap <- uwot::umap(
    train.matrix,
    n_neighbors = pars$n_neighbors[i],
    min_dist = pars$min_dist[i],
    n_threads = 7,
    verbose = TRUE
  )

  # Colour the points by label; only the plot part of the result is kept.
  ggp.umap <- plot.umap(
    res.umap,
    .colour = as.factor(train.label),
    title = title.txt
  )
  res[[i]] <- ggp.umap$plot
}
#> 1 : 0.0125 : 5
#> 01:17:51 Read 10000 rows and found 784 numeric columns
#> 01:17:51 Using Annoy for neighbor search, n_neighbors = 5
#> 01:17:51 Building Annoy index with metric = euclidean, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 01:17:55 Writing NN index file to temp file C:\Users\kato\AppData\Local\Temp\RtmpC6GiEi\file2c8866976fb5
#> 01:17:55 Searching Annoy index using 7 threads, search_k = 500
#> 01:17:58 Annoy recall = 100%
#> 01:17:58 Commencing smooth kNN distance calibration using 7 threads
#> 01:17:59 Initializing from normalized Laplacian + noise
#> 01:17:59 Commencing optimization for 500 epochs, with 60642 positive edges
#> 01:18:11 Optimization finished
#> 2 : 0.05 : 5
#> 01:18:11 Read 10000 rows and found 784 numeric columns
#> 01:18:11 Using Annoy for neighbor search, n_neighbors = 5
#> 01:18:11 Building Annoy index with metric = euclidean, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 01:18:15 Writing NN index file to temp file C:\Users\kato\AppData\Local\Temp\RtmpC6GiEi\file2c88783c3403
#> 01:18:15 Searching Annoy index using 7 threads, search_k = 500
#> 01:18:18 Annoy recall = 100%
#> 01:18:18 Commencing smooth kNN distance calibration using 7 threads
#> 01:18:19 Initializing from normalized Laplacian + noise
#> 01:18:19 Commencing optimization for 500 epochs, with 60642 positive edges
#> 01:18:36 Optimization finished
#> 3 : 0.2 : 5
#> 01:18:36 Read 10000 rows and found 784 numeric columns
#> 01:18:36 Using Annoy for neighbor search, n_neighbors = 5
#> 01:18:36 Building Annoy index with metric = euclidean, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 01:18:42 Writing NN index file to temp file C:\Users\kato\AppData\Local\Temp\RtmpC6GiEi\file2c882d671e09
#> 01:18:43 Searching Annoy index using 7 threads, search_k = 500
#> 01:18:48 Annoy recall = 100%
#> 01:18:49 Commencing smooth kNN distance calibration using 7 threads
#> 01:18:49 Initializing from normalized Laplacian + noise
#> 01:18:50 Commencing optimization for 500 epochs, with 60642 positive edges
#> 01:19:10 Optimization finished
#> 4 : 0.8 : 5
#> 01:19:10 Read 10000 rows and found 784 numeric columns
#> 01:19:10 Using Annoy for neighbor search, n_neighbors = 5
#> 01:19:10 Building Annoy index with metric = euclidean, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 01:19:16 Writing NN index file to temp file C:\Users\kato\AppData\Local\Temp\RtmpC6GiEi\file2c8819c1346a
#> 01:19:16 Searching Annoy index using 7 threads, search_k = 500
#> 01:19:19 Annoy recall = 100%
#> 01:19:19 Commencing smooth kNN distance calibration using 7 threads
#> 01:19:20 Initializing from normalized Laplacian + noise
#> 01:19:20 Commencing optimization for 500 epochs, with 60642 positive edges
#> 01:19:31 Optimization finished
#> 5 : 0.0125 : 20
#> 01:19:32 Read 10000 rows and found 784 numeric columns
#> 01:19:32 Using Annoy for neighbor search, n_neighbors = 20
#> 01:19:32 Building Annoy index with metric = euclidean, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 01:19:35 Writing NN index file to temp file C:\Users\kato\AppData\Local\Temp\RtmpC6GiEi\file2c884def6d8a
#> 01:19:35 Searching Annoy index using 7 threads, search_k = 2000
#> 01:19:40 Annoy recall = 100%
#> 01:19:40 Commencing smooth kNN distance calibration using 7 threads
#> 01:19:41 Initializing from normalized Laplacian + noise
#> 01:19:41 Commencing optimization for 500 epochs, with 274406 positive edges
#> 01:20:06 Optimization finished
#> 6 : 0.05 : 20
#> 01:20:06 Read 10000 rows and found 784 numeric columns
#> 01:20:06 Using Annoy for neighbor search, n_neighbors = 20
#> 01:20:06 Building Annoy index with metric = euclidean, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 01:20:09 Writing NN index file to temp file C:\Users\kato\AppData\Local\Temp\RtmpC6GiEi\file2c885d0d23f0
#> 01:20:09 Searching Annoy index using 7 threads, search_k = 2000
#> 01:20:14 Annoy recall = 100%
#> 01:20:14 Commencing smooth kNN distance calibration using 7 threads
#> 01:20:15 Initializing from normalized Laplacian + noise
#> 01:20:15 Commencing optimization for 500 epochs, with 274406 positive edges
#> 01:20:40 Optimization finished
#> 7 : 0.2 : 20
#> 01:20:40 Read 10000 rows and found 784 numeric columns
#> 01:20:40 Using Annoy for neighbor search, n_neighbors = 20
#> 01:20:40 Building Annoy index with metric = euclidean, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 01:20:43 Writing NN index file to temp file C:\Users\kato\AppData\Local\Temp\RtmpC6GiEi\file2c887b0c5e48
#> 01:20:43 Searching Annoy index using 7 threads, search_k = 2000
#> 01:20:48 Annoy recall = 100%
#> 01:20:48 Commencing smooth kNN distance calibration using 7 threads
#> 01:20:49 Initializing from normalized Laplacian + noise
#> 01:20:49 Commencing optimization for 500 epochs, with 274406 positive edges
#> 01:21:14 Optimization finished
#> 8 : 0.8 : 20
#> 01:21:14 Read 10000 rows and found 784 numeric columns
#> 01:21:14 Using Annoy for neighbor search, n_neighbors = 20
#> 01:21:14 Building Annoy index with metric = euclidean, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 01:21:17 Writing NN index file to temp file C:\Users\kato\AppData\Local\Temp\RtmpC6GiEi\file2c8862f710d3
#> 01:21:18 Searching Annoy index using 7 threads, search_k = 2000
#> 01:21:23 Annoy recall = 100%
#> 01:21:23 Commencing smooth kNN distance calibration using 7 threads
#> 01:21:24 Initializing from normalized Laplacian + noise
#> 01:21:24 Commencing optimization for 500 epochs, with 274406 positive edges
#> 01:21:49 Optimization finished
#> 9 : 0.0125 : 80
#> 01:21:49 Read 10000 rows and found 784 numeric columns
#> 01:21:49 Using Annoy for neighbor search, n_neighbors = 80
#> 01:21:49 Building Annoy index with metric = euclidean, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 01:21:53 Writing NN index file to temp file C:\Users\kato\AppData\Local\Temp\RtmpC6GiEi\file2c886c6c6b0f
#> 01:21:53 Searching Annoy index using 7 threads, search_k = 8000
#> 01:22:01 Annoy recall = 100%
#> 01:22:01 Commencing smooth kNN distance calibration using 7 threads
#> 01:22:02 Initializing from normalized Laplacian + noise
#> 01:22:03 Commencing optimization for 500 epochs, with 1111196 positive edges
#> 01:22:44 Optimization finished
#> 10 : 0.05 : 80
#> 01:22:44 Read 10000 rows and found 784 numeric columns
#> 01:22:44 Using Annoy for neighbor search, n_neighbors = 80
#> 01:22:44 Building Annoy index with metric = euclidean, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 01:22:47 Writing NN index file to temp file C:\Users\kato\AppData\Local\Temp\RtmpC6GiEi\file2c8831cc1bd5
#> 01:22:48 Searching Annoy index using 7 threads, search_k = 8000
#> 01:22:56 Annoy recall = 100%
#> 01:22:56 Commencing smooth kNN distance calibration using 7 threads
#> 01:22:57 Initializing from normalized Laplacian + noise
#> 01:22:58 Commencing optimization for 500 epochs, with 1111196 positive edges
#> 01:23:40 Optimization finished
#> 11 : 0.2 : 80
#> 01:23:40 Read 10000 rows and found 784 numeric columns
#> 01:23:40 Using Annoy for neighbor search, n_neighbors = 80
#> 01:23:40 Building Annoy index with metric = euclidean, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 01:23:43 Writing NN index file to temp file C:\Users\kato\AppData\Local\Temp\RtmpC6GiEi\file2c8874e8746
#> 01:23:44 Searching Annoy index using 7 threads, search_k = 8000
#> 01:23:52 Annoy recall = 100%
#> 01:23:52 Commencing smooth kNN distance calibration using 7 threads
#> 01:23:53 Initializing from normalized Laplacian + noise
#> 01:23:54 Commencing optimization for 500 epochs, with 1111196 positive edges
#> 01:24:35 Optimization finished
#> 12 : 0.8 : 80
#> 01:24:35 Read 10000 rows and found 784 numeric columns
#> 01:24:35 Using Annoy for neighbor search, n_neighbors = 80
#> 01:24:35 Building Annoy index with metric = euclidean, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 01:24:39 Writing NN index file to temp file C:\Users\kato\AppData\Local\Temp\RtmpC6GiEi\file2c881e4353c
#> 01:24:39 Searching Annoy index using 7 threads, search_k = 8000
#> 01:24:47 Annoy recall = 100%
#> 01:24:47 Commencing smooth kNN distance calibration using 7 threads
#> 01:24:48 Initializing from normalized Laplacian + noise
#> 01:24:49 Commencing optimization for 500 epochs, with 1111196 positive edges
#> 01:25:30 Optimization finished
#> 13 : 0.0125 : 320
#> 01:25:30 Read 10000 rows and found 784 numeric columns
#> 01:25:30 Using Annoy for neighbor search, n_neighbors = 320
#> 01:25:30 Building Annoy index with metric = euclidean, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 01:25:34 Writing NN index file to temp file C:\Users\kato\AppData\Local\Temp\RtmpC6GiEi\file2c8853143417
#> 01:25:34 Searching Annoy index using 7 threads, search_k = 32000
#> 01:25:50 Annoy recall = 100%
#> 01:25:50 Commencing smooth kNN distance calibration using 7 threads
#> 01:25:54 Initializing from normalized Laplacian + noise
#> 01:25:57 Commencing optimization for 500 epochs, with 4129726 positive edges
#> 01:27:05 Optimization finished
#> 14 : 0.05 : 320
#> 01:27:05 Read 10000 rows and found 784 numeric columns
#> 01:27:05 Using Annoy for neighbor search, n_neighbors = 320
#> 01:27:05 Building Annoy index with metric = euclidean, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 01:27:09 Writing NN index file to temp file C:\Users\kato\AppData\Local\Temp\RtmpC6GiEi\file2c8840b2b1e
#> 01:27:09 Searching Annoy index using 7 threads, search_k = 32000
#> 01:27:23 Annoy recall = 100%
#> 01:27:23 Commencing smooth kNN distance calibration using 7 threads
#> 01:27:27 Initializing from normalized Laplacian + noise
#> 01:27:30 Commencing optimization for 500 epochs, with 4129726 positive edges
#> 01:28:35 Optimization finished
#> 15 : 0.2 : 320
#> 01:28:35 Read 10000 rows and found 784 numeric columns
#> 01:28:35 Using Annoy for neighbor search, n_neighbors = 320
#> 01:28:35 Building Annoy index with metric = euclidean, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 01:28:39 Writing NN index file to temp file C:\Users\kato\AppData\Local\Temp\RtmpC6GiEi\file2c88701470d1
#> 01:28:39 Searching Annoy index using 7 threads, search_k = 32000
#> 01:28:53 Annoy recall = 100%
#> 01:28:54 Commencing smooth kNN distance calibration using 7 threads
#> 01:28:58 Initializing from normalized Laplacian + noise
#> 01:29:00 Commencing optimization for 500 epochs, with 4129726 positive edges
#> 01:30:06 Optimization finished
#> 16 : 0.8 : 320
#> 01:30:06 Read 10000 rows and found 784 numeric columns
#> 01:30:06 Using Annoy for neighbor search, n_neighbors = 320
#> 01:30:06 Building Annoy index with metric = euclidean, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 01:30:10 Writing NN index file to temp file C:\Users\kato\AppData\Local\Temp\RtmpC6GiEi\file2c884a817cdb
#> 01:30:10 Searching Annoy index using 7 threads, search_k = 32000
#> 01:30:25 Annoy recall = 100%
#> 01:30:25 Commencing smooth kNN distance calibration using 7 threads
#> 01:30:29 Initializing from normalized Laplacian + noise
#> 01:30:32 Commencing optimization for 500 epochs, with 4129726 positive edges
#> 01:31:38 Optimization finished
# Show the first panel on its own.
res[[1]]

# Combine all panels into a single grob laid out in four columns.
ggp.umap.compare <- gridExtra::arrangeGrob(ncol = 4, grobs = res)

# Auto-printing the grob lists its layout table; it does not draw the figure.
ggp.umap.compare
#> TableGrob (4 x 4) "arrange": 16 grobs
#> z cells name grob
#> 1 1 (1-1,1-1) arrange gtable[layout]
#> 2 2 (1-1,2-2) arrange gtable[layout]
#> 3 3 (1-1,3-3) arrange gtable[layout]
#> 4 4 (1-1,4-4) arrange gtable[layout]
#> 5 5 (2-2,1-1) arrange gtable[layout]
#> 6 6 (2-2,2-2) arrange gtable[layout]
#> 7 7 (2-2,3-3) arrange gtable[layout]
#> 8 8 (2-2,4-4) arrange gtable[layout]
#> 9 9 (3-3,1-1) arrange gtable[layout]
#> 10 10 (3-3,2-2) arrange gtable[layout]
#> 11 11 (3-3,3-3) arrange gtable[layout]
#> 12 12 (3-3,4-4) arrange gtable[layout]
#> 13 13 (4-4,1-1) arrange gtable[layout]
#> 14 14 (4-4,2-2) arrange gtable[layout]
#> 15 15 (4-4,3-3) arrange gtable[layout]
#> 16 16 (4-4,4-4) arrange gtable[layout]
# Summarise the list of plots, expanding only one level of nesting.
str(res, 1)
#> List of 16
#> $ :List of 10
#> ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
#> $ :List of 10
#> ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
#> $ :List of 10
#> ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
#> $ :List of 10
#> ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
#> $ :List of 10
#> ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
#> $ :List of 10
#> ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
#> $ :List of 10
#> ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
#> $ :List of 10
#> ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
#> $ :List of 10
#> ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
#> $ :List of 10
#> ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
#> $ :List of 10
#> ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
#> $ :List of 10
#> ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
#> $ :List of 10
#> ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
#> $ :List of 10
#> ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
#> $ :List of 10
#> ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
#> $ :List of 10
#> ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
# Make sure the target directory exists so the save does not fail at the very
# end of the run; this is a no-op when the directory is already there.
dir.create("./output", showWarnings = FALSE, recursive = TRUE)

# Write the combined grid to disk. `plot` is named explicitly because the
# first positional argument of ggsave() is the file name, not the plot.
ggsave(
  filename = "./output/040_umap_min_dist_vs_n_neighbors.png",
  plot = ggp.umap.compare,
  height = 16,
  width = 16
)